Rows: 17379 Columns: 14
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): Date
dbl (13): Season, Hour, Holiday, Day of the Week, Working Day, Weather Type,...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Sample plots of the raw hourly series; report the sample mean of Total Users
hourly_stats <- plotts.sample.wge(bike_data$`Total Users`)
hourly_stats$xbar
[1] 189.4631
# Make the bike data daily instead of hourly.
# Calendar/weather fields are averaged within each day; user counts are summed.
# (Fixes from extraction: the comment was fused onto the code line, which would have
# commented out the whole pipeline, and summarise() had a trailing empty argument.)
daily_bike_data <- bike_data %>%
  group_by(Date = as.Date(Date, format = "%m/%d/%Y")) %>%
  summarise(
    Season            = mean(Season),
    Hour              = mean(Hour),  # constant 11.5 for a full day; kept for compatibility
    Holiday           = mean(Holiday),
    Day_of_the_Week   = mean(`Day of the Week`),
    Working_Day       = mean(`Working Day`),
    Weather_Type      = mean(`Weather Type`),
    Temperature       = mean(`Temperature F`),
    Temperature_Feels = mean(`Temperature Feels F`),
    Humidity          = mean(Humidity),
    Wind_Speed        = mean(`Wind Speed`),
    Casual_Users      = sum(`Casual Users`),
    Registered_Users  = sum(`Registered Users`),
    Total_Users       = sum(`Total Users`)
  )
plotts.sample.wge(daily_bike_data$Total_Users)$xbar
[1] 4504.349
Models
Univariate: At least 2 candidate ARMA / ARIMA models
ARMA Model
# Sample plots of the daily series; the spectral density suggests wandering behavior
daily_stats <- plotts.sample.wge(daily_bike_data$Total_Users)
daily_stats$xbar
[1] 4504.349
aic5.wge(daily_bike_data$Total_Users,type ='bic', p =0:10) # BIC's top pick is ARMA(2,1) per the table below (not p = 7 / q = 0 as originally noted)
---------WORKING... PLEASE WAIT...
Five Smallest Values of bic
p q bic
2 1 13.69076
1 2 13.69813
3 1 13.69839
2 2 13.70129
4 1 13.70332
arma_est =est.arma.wge(daily_bike_data$Total_Users, p =4, q =1) # fit ARMA(4,1); NOTE(review): (4,1) is the 5th-ranked BIC model above, BIC's top pick was (2,1) — confirm this choice was intentional
Coefficients of AR polynomial:
1.3314 -0.3662 -0.0398 0.0714
AR Factor Table
Factor Roots Abs Recip System Freq
1-0.9958B 1.0042 0.9958 0.0000
1-0.6451B+0.2316B^2 1.3928+-1.5422i 0.4812 0.1331
1+0.3094B -3.2316 0.3094 0.5000
Coefficients of MA polynomial:
0.8546
MA FACTOR TABLE
Factor Roots Abs Recip System Freq
1-0.8546B 1.1701 0.8546 0.0000
arma_est$aic # report the fitted model's AIC for comparison across candidate models
[1] 13.66561
# Check the residuals to see if the model is appropriate
plotts.wge(arma_est$res)  # looks random
acf(arma_est$res)         # all ACFs are within the bounds
# Last-N forecasts for the short (7-day) and long (60-day) horizons
arma_stfore1 = fore.arima.wge(daily_bike_data$Total_Users, phi = arma_est$phi,
                              theta = arma_est$theta, n.ahead = 7, lastn = TRUE)
# BUG FIX: arma_ltfore1 was used in the ASE below but never created in any visible
# chunk — add the 60-step last-N forecast matching the 672:731 holdout window
arma_ltfore1 = fore.arima.wge(daily_bike_data$Total_Users, phi = arma_est$phi,
                              theta = arma_est$theta, n.ahead = 60, lastn = TRUE)
# ASE over the held-out windows (short: days 725-731, long: days 672-731)
arma_stASE = mean((daily_bike_data$Total_Users[725:731] - arma_stfore1$f)^2)
arma_ltASE = mean((daily_bike_data$Total_Users[672:731] - arma_ltfore1$f)^2)
arma_stASE
[1] 3358954
arma_ltASE # long-horizon (60-day) ASE for the ARMA model
[1] 3058217
# Rolling-window RMSE — left commented out because roll.win.rmse.wge emits an
# unsuppressable amount of output; the values below were recorded from a prior run.
# arma_strwRMSE = roll.win.rmse.wge(daily_bike_data$Total_Users, horizon = 7, phi = arma_est$phi, theta = arma_est$theta, s = 2)$rwRMSE
# arma_ltrwRMSE = roll.win.rmse.wge(daily_bike_data$Total_Users, horizon = 60, phi = arma_est$phi, theta = arma_est$theta, s = 2)$rwRMSE
arma_strwRMSE = 1674.472
arma_ltrwRMSE = 6297.147
# Forecasts beyond the end of the data for plotting
arma_stfore2 = fore.arima.wge(daily_bike_data$Total_Users, phi = arma_est$phi,
                              theta = arma_est$theta, n.ahead = 7)
# BUG FIX: arma_ltfore2 was plotted below but never created in any visible chunk —
# add the 60-step forecast matching the t[732:791] plotting window
arma_ltfore2 = fore.arima.wge(daily_bike_data$Total_Users, phi = arma_est$phi,
                              theta = arma_est$theta, n.ahead = 60)
# Plots of the short- and long-term forecasts with 95% limits (red dashed)
t = 1:800
plot(t[670:731], daily_bike_data$Total_Users[670:731], type = 'l',
     main = "Short Term Forecast", xlim = c(670, 745),
     xlab = "Time", ylab = "Total Users")
points(t[732:738], arma_stfore2$f,  type = 'l', col = 'blue')
points(t[732:738], arma_stfore2$ll, type = 'l', lwd = 2, lty = 2, col = 'red')
points(t[732:738], arma_stfore2$ul, type = 'l', lwd = 2, lty = 2, col = 'red')
plot(t[670:731], daily_bike_data$Total_Users[670:731], type = 'l',
     main = "Long Term Forecast", xlim = c(670, 795),
     xlab = "Time", ylab = "Total Users")
points(t[732:791], arma_ltfore2$f,  type = 'l', col = 'blue')
points(t[732:791], arma_ltfore2$ll, type = 'l', lwd = 2, lty = 2, col = 'red')
points(t[732:791], arma_ltfore2$ul, type = 'l', lwd = 2, lty = 2, col = 'red')
Non-Seasonal ARIMA Model
# Stationarity check: ADF fails to reject a unit root (p ≈ 0.73),
# so the data are not likely stationary
adf.test(daily_bike_data$Total_Users)
Augmented Dickey-Fuller Test
data: daily_bike_data$Total_Users
Dickey-Fuller = -1.6351, Lag order = 9, p-value = 0.7327
alternative hypothesis: stationary
# Take a first difference to remove the wandering behavior,
# then identify a model for the differenced series
total_d1 = artrans.wge(daily_bike_data$Total_Users, phi.tr = c(1))
aic5.wge(total_d1, type = 'bic')  # both BIC and AIC select p = 1, q = 1
---------WORKING... PLEASE WAIT...
Five Smallest Values of bic
p q bic
1 1 13.68620
0 2 13.69304
2 1 13.69387
1 2 13.69455
3 1 13.69895
# Fit ARMA(1,1) to the differenced series (overall model: ARIMA(1,1,1))
est = est.arma.wge(total_d1, p = 1, q = 1)
Coefficients of AR polynomial:
0.3591
AR Factor Table
Factor Roots Abs Recip System Freq
1-0.3591B 2.7848 0.3591 0.0000
Coefficients of MA polynomial:
0.8903
MA FACTOR TABLE
Factor Roots Abs Recip System Freq
1-0.8903B 1.1232 0.8903 0.0000
# Residual diagnostics with ACF limit bands — appears to be white noise
plotts.sample.wge(est$res, arlimits = TRUE)$xbar
# Generate four ARIMA(1,1,1) realizations to compare against the observed series
arima_gen1 = gen.arima.wge(200, phi = est$phi, theta = est$theta, d = 1)
arima_gen2 = gen.arima.wge(200, phi = est$phi, theta = est$theta, d = 1)
arima_gen3 = gen.arima.wge(200, phi = est$phi, theta = est$theta, d = 1)
arima_gen4 = gen.arima.wge(200, phi = est$phi, theta = est$theta, d = 1)
plotts.sample.wge(arima_gen1)$xbar
[1] 7.377414
plotts.sample.wge(arima_gen2)$xbar # second generated realization
[1] -2.337345
plotts.sample.wge(arima_gen3)$xbar # third generated realization
[1] -2.396652
plotts.sample.wge(arima_gen4)$xbar # Similar behavior to original data
[1] -9.309329
# Compare the data's spectral density against 20 generated realizations.
# FIX: plot = "FALSE" (a string) replaced with the logical FALSE these
# functions expect; the string only worked by accident of coercion.
sims = 20
SpecDen = parzen.wge(daily_bike_data$Total_Users, plot = FALSE)
plot(SpecDen$freq, SpecDen$pzgram, type = "l", lwd = 6)
for (i in 1:sims) {
  SpecDen2 = parzen.wge(gen.aruma.wge(length(daily_bike_data$Total_Users), d = 1,
                                      phi = est$phi, theta = est$theta, plot = FALSE),
                        plot = FALSE)
  lines(SpecDen2$freq, SpecDen2$pzgram, lwd = 2, col = "red")
}
# Compare ACFs the same way
ACF = acf(daily_bike_data$Total_Users, plot = FALSE)
plot(ACF$lag, ACF$acf, type = "l", lwd = 6)
for (i in 1:sims) {
  ACF2 = acf(gen.aruma.wge(length(daily_bike_data$Total_Users), d = 1,
                           phi = est$phi, theta = est$theta, plot = FALSE),
             plot = FALSE)
  lines(ACF2$lag, ACF2$acf, lwd = 2, col = "red")
}
# Short- and long-term last-N forecasts
fore_arima_st = fore.arima.wge(daily_bike_data$Total_Users, phi = est$phi,
                               theta = est$theta, d = 1, n.ahead = 7, lastn = TRUE)
# BUG FIX: fore_arima_lt was used in the ASE below but never created in any
# visible chunk — add the 60-step last-N forecast for the 672:731 holdout
fore_arima_lt = fore.arima.wge(daily_bike_data$Total_Users, phi = est$phi,
                               theta = est$theta, d = 1, n.ahead = 60, lastn = TRUE)
# ASE over the held-out windows
arima_stASE = mean((daily_bike_data$Total_Users[725:731] - fore_arima_st$f)^2)
arima_ltASE = mean((daily_bike_data$Total_Users[672:731] - fore_arima_lt$f)^2)
arima_stASE
[1] 3230238
arima_ltASE # long-horizon (60-day) ASE for the ARIMA model
[1] 3930824
# Rolling-window RMSE — left commented out because roll.win.rmse.wge emits an
# unsuppressable amount of output; the values below were recorded from a prior run.
# arima_strwRMSE = roll.win.rmse.wge(daily_bike_data$Total_Users, horizon = 7, phi = est$phi, theta = est$theta, d = 1)$rwRMSE
# arima_ltrwRMSE = roll.win.rmse.wge(daily_bike_data$Total_Users, horizon = 60, phi = est$phi, theta = est$theta, d = 1)$rwRMSE
arima_strwRMSE = 1237.393
arima_ltrwRMSE = 1503.197
# Forecasts beyond the end of the data for plotting
fore_arima_st_2 = fore.arima.wge(daily_bike_data$Total_Users, phi = est$phi,
                                 theta = est$theta, d = 1, n.ahead = 7)
# BUG FIX: fore_arima_lt_2 was plotted below but never created in any visible
# chunk — add the 60-step forecast matching the t[732:791] plotting window
fore_arima_lt_2 = fore.arima.wge(daily_bike_data$Total_Users, phi = est$phi,
                                 theta = est$theta, d = 1, n.ahead = 60)
# Plots of the short- and long-term forecasts with 95% limits (red dashed)
t = 1:800
plot(t[670:731], daily_bike_data$Total_Users[670:731], type = 'l',
     main = "Short Term Forecast", xlim = c(670, 795),
     xlab = "Time", ylab = "Total Users")
points(t[732:738], fore_arima_st_2$f,  type = 'l', col = 'blue')
points(t[732:738], fore_arima_st_2$ll, type = 'l', lwd = 2, lty = 2, col = 'red')
points(t[732:738], fore_arima_st_2$ul, type = 'l', lwd = 2, lty = 2, col = 'red')
plot(t[670:731], daily_bike_data$Total_Users[670:731], type = 'l',
     main = "Long Term Forecast", xlim = c(670, 795),
     xlab = "Time", ylab = "Total Users")
points(t[732:791], fore_arima_lt_2$f,  type = 'l', col = 'blue')
points(t[732:791], fore_arima_lt_2$ll, type = 'l', lwd = 2, lty = 2, col = 'red')
points(t[732:791], fore_arima_lt_2$ul, type = 'l', lwd = 2, lty = 2, col = 'red')
Present the models in factored form, or at least separate the stationary and non-stationary factors, along with the standard deviation or variance of the white noise.
AIC
ASE (short and long term forecasts)
Rolling Window RMSE (short and long term forecasts)
At least 10 superimposed spectral densities from 10 generated realizations like we did in Unit 11. Use these to help choose between the at least two candidate univariate models.
Visualization of Forecasts for both the short- and long-term Horizons.
Be sure and include confidence intervals when possible (I don’t have code for confidence intervals from MLP models at the moment… but that would be a good thing to work on! )
Multivariate: At least one multivariate model (VAR or MLR with Correlated Errors)
Include an ASE (rolling window is not yet available in multivariate models)
Short Horizon (you pick the length — it could be one step ahead).
Long Horizon (you pick the length; it just must be longer than the short horizon).
Describe the explanatory variable(s) used in the model and why you felt they were significant / important.
Visualization of Forecasts for both the short- and long-Horizons.
Be sure and include confidence intervals if using VAR …
# Cross-correlation between Total_Users and Humidity — strongest at a lag of 4
ccf(daily_bike_data$Total_Users, daily_bike_data$Humidity)
# Lag Humidity by 4 days, then fit the short-term MLR on the training window (days 1-724)
daily_bike_data$Humidity_lag = dplyr::lag(daily_bike_data$Humidity, 4)
fit1 = lm(Total_Users ~ Humidity_lag + Temperature + Day_of_the_Week + Wind_Speed,
          data = daily_bike_data[1:724, ])
plotts.sample.wge(fit1$residuals)$xbar
[1] 1.013076e-12
# Identify an ARMA model for the regression residuals — BIC table follows
aic5.wge(fit1$residuals, type = 'bic')
---------WORKING... PLEASE WAIT...
Five Smallest Values of bic
p q bic
2 1 13.55385
3 1 13.56279
2 2 13.56438
4 1 13.56665
3 2 13.56898
# MLR with correlated errors: ARMA(2,1) noise plus the four exogenous regressors
# (short-term fit on days 1-724)
mlr1 = arima(daily_bike_data$Total_Users[1:724], order = c(2, 0, 1),
             xreg = cbind(daily_bike_data$Humidity_lag[1:724],
                          daily_bike_data$Temperature[1:724],
                          daily_bike_data$Day_of_the_Week[1:724],
                          daily_bike_data$Wind_Speed[1:724]))
plotts.wge(mlr1$residuals)   # looks random
acf(mlr1$residuals[-(1:5)])  # drop first 5 (lagged-regressor NAs); none out of bounds
# Long-term prediction fit on days 1-671
fit2 = lm(Total_Users ~ Humidity_lag + Temperature + Day_of_the_Week + Wind_Speed,
          data = daily_bike_data[1:671, ])
plotts.sample.wge(fit2$residuals)$xbar
[1] -3.858876e-13
# Identify an ARMA model for the long-term fit's residuals — BIC table follows
aic5.wge(fit2$residuals, type = 'bic')
---------WORKING... PLEASE WAIT...
Five Smallest Values of bic
p q bic
2 1 13.56183
3 1 13.57159
2 2 13.57250
1 2 13.57421
4 1 13.57535
# Long-term MLR with ARMA(2,1) correlated errors (days 1-671)
mlr2 = arima(daily_bike_data$Total_Users[1:671], order = c(2, 0, 1),
             xreg = cbind(daily_bike_data$Humidity_lag[1:671],
                          daily_bike_data$Temperature[1:671],
                          daily_bike_data$Day_of_the_Week[1:671],
                          daily_bike_data$Wind_Speed[1:671]))
plotts.wge(mlr2$residuals)   # looks random
acf(mlr2$residuals[-(1:5)])  # only 1/20 ACFs out of bounds
# Identify a model for the differenced temperature series
# (tempdiff is created in an earlier chunk not shown here)
aic5.wge(tempdiff, type = 'bic')  # BIC picks p = 1 and q = 1
---------WORKING... PLEASE WAIT...
Five Smallest Values of bic
p q bic
1 1 3.714381
2 0 3.720397
3 0 3.722154
0 2 3.723965
1 2 3.726426
# Fit ARMA(1,1) to the differenced temperature series
temp = est.arma.wge(tempdiff, p = 1, q = 1)
Coefficients of AR polynomial:
0.4839
AR Factor Table
Factor Roots Abs Recip System Freq
1-0.4839B 2.0667 0.4839 0.0000
Coefficients of MA polynomial:
-0.3906
MA FACTOR TABLE
Factor Roots Abs Recip System Freq
1+0.3906B -2.5605 0.3906 0.5000
# Residual whiteness checks for the temperature model
acf(temp$res)
ljung.wge(temp$res)  # p-value close to 0.05 but we fail to reject white noise
Warning in summary.lm(x): essentially perfect fit: summary may be unreliable
# Overlay the short-term VAR forecast on the observed series
# (var_st_pred2 is produced by a VAR chunk not shown here)
t = 1:800
plot(seq(1, 731, 1), daily_bike_data$Total_Users, type = 'l',
     xlim = c(670, 738), main = "Short Term VAR Forecast")
points(t[732:738], var_st_pred2$fcst$Total_Users[, 1],
       type = "l", lwd = 2, lty = 1, col = 'blue')
Warning in summary.lm(x): essentially perfect fit: summary may be unreliable
# Overlay the long-term VAR forecast on the observed series
# (var_lt_pred2 is produced by a VAR chunk not shown here)
t = 1:800
plot(seq(1, 731, 1), daily_bike_data$Total_Users, type = 'l',
     xlim = c(670, 791), main = "Long Term VAR Forecast")
points(t[732:791], var_lt_pred2$fcst$Total_Users[, 1],
       type = "l", lwd = 2, lty = 1, col = 'blue')
MLP Model
ASE (short and long term forecasts)
Rolling Window RMSE (short and long term forecasts; only if univariate).
Visualization of Forecasts for both the short- and long-term Horizons.
Confidence / Prediction intervals are not required (I don’t have code for confidence / prediction intervals (bootstrap intervals) for MLP models at the moment… but that would be a good thing to work on! )
# Convert the tibble to a data frame of ts columns, dropping fields not used for modeling
bike_DF <- daily_bike_data %>%
  dplyr::select(-c(Date, Hour, Holiday, Temperature_Feels,
                   Casual_Users, Registered_Users)) %>%
  mutate(across(everything(), ~zoo(.x, order.by = daily_bike_data$Date))) %>%  # date-indexed zoo
  mutate(across(everything(), ~as.ts(.x))) %>%                                 # then plain ts
  as.data.frame()
# Train/test splits for the short (7-day) and long (60-day) horizons
bikeShortTrain = bike_DF[1:724, ]
bikeShortTest  = bike_DF[725:731, ]
bikeLongTrain  = bike_DF[1:671, ]
bikeLongTest   = bike_DF[672:731, ]
seed = 137

# Forecast the short-term predictor variables with univariate MLPs
set.seed(seed)
fit.mlp.short.Season = mlp(ts(bikeShortTrain[, "Season"]), reps = 10,
                           difforder = 0, comb = "mean", det.type = "bin")
fore.mlp.short.Season = forecast(fit.mlp.short.Season, h = 7)
plot(fore.mlp.short.Season)

fit.mlp.short.Day_of_the_Week = mlp(ts(bikeShortTrain[, "Day_of_the_Week"]), reps = 10,
                                    difforder = 0, comb = "mean", det.type = "bin")
fore.mlp.short.Day_of_the_Week = forecast(fit.mlp.short.Day_of_the_Week, h = 7)
fore.mlp.short.Day_of_the_Week$mean = round(fore.mlp.short.Day_of_the_Week$mean)  # integer day codes
plot(fore.mlp.short.Day_of_the_Week)

fit.mlp.short.Working_Day = mlp(ts(bikeShortTrain[, "Working_Day"]), reps = 10,
                                difforder = 0, comb = "mean", det.type = "bin")
fore.mlp.short.Working_Day = forecast(fit.mlp.short.Working_Day, h = 7)
fore.mlp.short.Working_Day$mean = round(fore.mlp.short.Working_Day$mean)  # binary indicator
plot(fore.mlp.short.Working_Day)

# Forecast Total_Users and evaluate ASE for both horizons
# (fit.mlp.st / fit.mlp.lt and the xreg matrices BDF_fore_short / BDF_fore_long
# come from chunks not shown here)
fore.mlp.st = forecast(fit.mlp.st, h = 7,  xreg = BDF_fore_short)
fore.mlp.lt = forecast(fit.mlp.lt, h = 60, xreg = BDF_fore_long)
ASE.mlp.st = mean((bikeShortTest$Total_Users - fore.mlp.st$mean)^2)
ASE.mlp.lt = mean((bikeLongTest$Total_Users - fore.mlp.lt$mean)^2)
print(paste("MLP ASE Short Term: ", round(ASE.mlp.st, 2)))
[1] "MLP ASE Short Term: 4177344.82"
print(paste("MLP ASE Long Term: ", round(ASE.mlp.lt, 2))) # report the 60-day MLP ASE
[1] "MLP ASE Long Term: 10000456.14"
# Built-in MLP forecast plots
plot(fore.mlp.st)
plot(fore.mlp.lt)
# Overlay the forecasts on the observed series
t = 1:731
plot(t[720:731], bike_DF$Total_Users[720:731], type = 'l',
     xlab = "Time", ylab = "Total Users", main = "MLP Short Term Forecast")
lines(t[725:731], fore.mlp.st$mean, type = "l", lwd = 2, lty = 2, col = 'blue')
plot(t[600:731], bike_DF$Total_Users[600:731], type = 'l',
     xlab = "Time", ylab = "Total Users", main = "MLP Long Term Forecast")
lines(t[672:731], fore.mlp.lt$mean, type = "l", lwd = 2, lty = 2, col = 'blue')
Ensemble Model
# Code for Figure 11.37 (course example: cardiac mortality VAR/MLP ensemble)
# TODO: Adapt this template to the bike data
# Ensemble — VAR p = 7, non-seasonal
CMortVAR7 = VAR(cardiacTrain, type = "both", p = 7)  # p = 2 from SBC
preds7 = predict(CMortVAR7, n.ahead = 156)
RMSEVAR7 = sqrt(mean((cardiacTest[, "cmort"] - preds7$fcst$cmort[, 1])^2))
RMSEVAR7  # 6.664
# VAR p = 2, seasonal
CMortVAR2S = VAR(cardiacTrain, season = 52, type = "both", p = 2)  # p = 2 from SBC
preds2S = predict(CMortVAR2S, n.ahead = 156)
# Ensemble: average of the seasonal VAR and MLP forecasts
ensemble = (preds2S$fcst$cmort[, 1] + fore.mlp.cmort$mean) / 2
# Plot the competing forecasts
plot(seq(1, 508, 1), cardiac[, "cmort"], type = "l",
     xlim = c(350, 508), ylim = c(70, 110),
     xlab = "Time", ylab = "Cardiac Mortality",
     main = "52 Week Cardiac Mortality Forecast From A VAR/MLP Ensemble")
lines(seq(353, 508, 1), ensemble, type = "l", lwd = 4, col = "green")
lines(seq(353, 508, 1), preds2S$fcst$cmort[, 1], type = "l", lwd = 2, lty = 2, col = "red")
lines(seq(353, 508, 1), fore.mlp.cmort$mean, type = "l", lwd = 2, lty = 4, col = "blue")
lines(seq(353, 508, 1), preds7$fcst$cmort[, 1], type = "l", lwd = 2, lty = 2, col = "purple")
# RMSE comparison across the four candidates
RMSEVAR7 = sqrt(mean((cardiacTest[, "cmort"] - preds7$fcst$cmort[, 1])^2))
RMSEVAR7
RMSEVAR2S = sqrt(mean((cardiacTest[, "cmort"] - preds2S$fcst$cmort[, 1])^2))
RMSEVAR2S
RMSE = sqrt(mean((cardiacTest[, "cmort"] - fore.mlp.cmort$mean)^2))
RMSE
RMSEENSEMBLE = sqrt(mean((cardiacTest[, "cmort"] - ensemble)^2))
RMSEENSEMBLE  # 5.64, 5.81
Model Comparison and Final Forecasts
Provide a table comparing all models on at least ASE and rwRMSE (if available).
Include at least one ensemble model in addition to the models above.
Make a case as to why you feel one of your candidate models is the most useful.
Provide your final short- and long-term forecasts with that model.
Attaching package: 'kableExtra'
The following object is masked from 'package:dplyr':
group_rows